* Session setup: reset Stata, define the globals used throughout the script,
* initialise the graph style, and load the raw data.
clear all 
drop _all 
* Seed handed to rseed() in the lasso call so the model selection is reproducible.
global seed 12345 
* Resume-characteristic controls: used as regressors and in the lag-variable loop.
global ctr randgpa topemp secondint wfm techskills  
* Demographic indicators (created further down) that also get lagged versions.
global dem WhiteFemale MinorityFemale MinorityMale
* grstyle is a community-contributed package and must be installed beforehand
* (e.g. via ssc install grstyle). NOTE(review): "hor" is presumably the
* abbreviation of the horizontal option (horizontal axis tick labels) - confirm.
grstyle init 
grstyle set plain, hor 

* Uncomment the next line and point it at the folder that contains IRR_raw_data.dta;
* all file paths in this script are relative to the working directory.
//cd "Your Path"

use "IRR_raw_data", clear 

	** demographic indicators derived from the randomized resume name
	* gender of the first name
	generate FemaleName = (randfirstname_gender == "female")
	generate MaleName   = 1 - FemaleName
	* ethnicity: rand_eth code 4 is flagged as a white name
	generate WhiteName  = (rand_eth == 4)
	* the four gender-by-ethnicity cells (mutually exclusive 0/1 indicators)
	generate WhiteMale      = (WhiteName & !FemaleName)
	generate MinorityFemale = (FemaleName & !WhiteName)
	generate MinorityMale   = (!FemaleName & !WhiteName)
	generate WhiteFemale    = (FemaleName & WhiteName)

	
	** predict resume quality with LASSO and bin the predictions
	* GPA identifiers: GPAid groups GPA rounded to one decimal (scaled by 10),
	* GPAid2 groups the exact GPA values and enters the model as a factor.
	generate gpa_dec1 = round(randgpa, .1)
	replace gpa_dec1 = 10 * gpa_dec1
	egen GPAid  = group(gpa_dec1)
	egen GPAid2 = group(randgpa)

	* hiring interest on GPA dummies and resume characteristics; seeded for reproducibility
	lasso linear rating_hire i.GPAid2 topemp secondint wfm techskills, ///
		selection(adaptive, ridge) rseed($seed)
	predict RHire

	* ten quantile bins of predicted hiring interest
	xtile tile = RHire, nquantiles(10)
	tabulate tile

	* fold bin 4 into bin 5 and bin 8 into bin 9, then renumber the bins consecutively
	replace tile = tile + 1 if inlist(tile, 4, 8)
	egen Tile = group(tile)

	* one WhiteMale indicator per quality group: WhiteMale_t1 ... WhiteMale_t<max>
	summarize Tile
	local nTiles = r(max)
	forvalues q = 1/`nTiles' {
		generate WhiteMale_t`q' = WhiteMale * (Tile == `q')
	}
	
	
	** estimate the white-male bias for each quality group
	* Regress hiring interest on the WhiteMale-by-quality-group indicators plus the
	* resume controls in $ctr, absorbing subjectid, majorid, resumenum, lead1 and
	* lead2, with robust standard errors. Results are stored under the name Prem.
	eststo Prem: reghdfe rating_hire WhiteMale_t* $ctr, a(subjectid majorid resumenum lead1 lead2) vce(r)

	* Plot the WhiteMale_t* coefficients with 95% confidence intervals.
	* NOTE(review): the seven x-axis labels are hard-coded, while the number of
	* WhiteMale_t* terms depends on how many distinct quality groups the data
	* produce - confirm they still match after any change to the data or model.
	coefplot Prem , ///
	keep(WhiteMale_t*) level(95) legend(off) vertical yline(0, lc(gray)) mc(black) ciopts(col(black)) ///
	xtitle("Rank of Predicted Quality", margin(t=2)) xlab(1 "0-10%" 2 "10-36%" 3 "36-52%" 4 "52-60%" 5 "60-77%" 6 "77-90%" 7 "90-100%", labsize(small)) ytitle("Being a White Man") ylab(, labsize(small) nogrid) name(be2, replace)

	* graph export does not create missing folders; make sure ./output exists so the
	* script does not abort here on a fresh checkout (capture ignores "already exists").
	capture mkdir "output"
	graph export "./output/FigA2_Bias_by_Quality.pdf", replace
		
		
	** split resumes into low- and high-quality types (data-driven cutoff)
	* the two lowest quality groups count as low quality, everything else as high quality
	generate LQType = (Tile <= 2)
	generate HQType = 1 - LQType

	* quality type crossed with the white-male and minority-female indicators
	generate LQWhiteMale      = (LQType & WhiteMale)
	generate HQWhiteMale      = (HQType & WhiteMale)
	generate HQMinorityFemale = (HQType & MinorityFemale)
	generate LQMinorityFemale = (LQType & MinorityFemale)

	** rater (employer) characteristics: rater_ethnicity code 3 is flagged as white
	generate whitemale_employer = (!rater_female & rater_ethnicity == 3)
	generate white_employer     = (rater_ethnicity == 3)
	generate male_employer      = (!rater_female)

	** diversity score: row sum of the two diversity factors, a high-score flag
	** (sum above 14), and standardized versions of the score and of predicted quality
	egen diversity = rowtotal(factor_genderdiversity factor_racialdiversity)
	generate HDiversity = (diversity > 14)
	egen Diversity = std(diversity)
	egen Quality   = std(RHire)

	** label the analysis variables
	* These labels are assigned BEFORE the lag loop because the loop builds each
	* "_prev" label from the label of its source variable. Assigning them afterwards
	* left the generated labels as a bare "After ".
	* Where a variable used to be labelled twice, only the label that was
	* assigned last (and therefore took effect) is kept.
	lab var rating_hire "Hiring Interest"
	lab var randgpa "GPA"
	lab var topemp "Top Internship"
	lab var secondint "Second Internship"
	lab var wfm "Work for Money"
	lab var techskills "Technical Skills"
	lab var WhiteMale "White Man"
	lab var WhiteFemale "White Female"
	lab var MinorityFemale "Non-White Female"
	lab var MinorityMale "Non-White Male"

	** create lag variables
	* For every listed variable x, x_prev holds the value of x on the preceding row
	* of the same subject (rows ordered by resumenum). It is missing for a
	* subject's first resume and wherever the preceding row is another subject.
	sort subjectid resumenum
	foreach x in $ctr $dem LQType HQType LQWhiteMale HQWhiteMale WhiteMale WhiteName LQMinorityFemale HQMinorityFemale {
		qui gen `x'_prev=. if resumenum==1
		qui replace `x'_prev=`x'[_n-1] if subjectid==subjectid[_n-1] &  resumenum>1
		* label as "After <source label>"; fall back to the variable name when
		* the source variable carries no label
		local srclab : variable label `x'
		if `"`srclab'"' == "" local srclab "`x'"
		lab var `x'_prev `"After `srclab'"'
	}

	* hand-written labels that replace the generated ones
	lab var randgpa_prev "Previous GPA"
	lab var LQWhiteMale_prev "After Low-Quality White Man"
	lab var WhiteMale_prev "After White Man"
	lab var LQType_prev "After Low-Quality"

	save "IRR_cleaned_data.dta", replace
 